# -*- coding: utf-8 -*-
# @Date : 2023/3/10 21:03
# @Author : Paul
# @File : kmeans.py
# @Description : Kmeans聚类算法

import matplotlib.pyplot as plt
import sys
import json

from core.beans.param_train_result import ParamTrainResult
from core.utils.string_utils import StringUtils

from clusters.cluster import Cluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from core.beans.cluster_result import ClusterResult
from core.utils.color_util import ColorUtil
from core.utils.date_util import DateUtil
from core.utils.log_util import LogUtil


class TWKMeansAlgo(Cluster):
    """K-means clustering job backed by ``sklearn.cluster.KMeans``.

    Defines ``initModel``, ``buildModel`` and ``evalModel`` (presumably hooks
    driven by ``Cluster.execute`` -- confirm in the base class) plus
    ``paramTrain``, a sweep over the number of clusters scored with the
    silhouette coefficient.

    Attributes such as ``param``, ``feature_cols``, ``start_time``,
    ``image_path``, ``cluster_pred_image``, ``train_data_dr`` and
    ``meta_data_source`` are read here but never assigned in this class, so
    they are presumably provided by ``Cluster`` (verify there).
    """

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 param=None
                 ):
        """
        Initialise the job and forward every argument to ``Cluster``.

        :param app_name: application name, forwarded to the base class
        :param data_source_id: data source identifier, forwarded to the base class
        :param table_name: source table name, forwarded to the base class
        :param feature_cols: feature column names, forwarded to the base class
        :param param: job parameter dict; this class reads its ``algoParam``,
            ``paramTrain`` and ``id`` entries
        """
        super(TWKMeansAlgo, self).__init__(app_name=app_name,
                                           data_source_id=data_source_id,
                                           table_name=table_name,
                                           feature_cols=feature_cols,
                                           param=param)
        # Misspelled attribute from the original code; kept only so that any
        # external reader of ``cluser`` does not break.
        self.cluser = None
        # The estimator attribute every other method actually uses.  getattr
        # avoids clobbering a value the base initialiser may already have set.
        self.cluster = getattr(self, "cluster", None)
        # Cluster centroids, filled in by buildModel.
        self.centroid = None
        # Silhouette coefficient, filled in by evalModel.
        self.silhouette_score = None
        # NOTE(review): the original comment said "default: no evaluation
        # needed", which contradicts the value True -- confirm how the base
        # class interprets this flag.
        self.IS_MODEL_EVAL = True

    def initModel(self):
        """
        Create the (unfitted) KMeans estimator.

        ``algoParam["nClusters"]`` selects the number of clusters; 8 is used
        when ``StringUtils.isBlack`` reports the value as blank.
        """
        algoParam = self.param["algoParam"]
        self.n_clusters = 8 if StringUtils.isBlack(algoParam["nClusters"]) else int(algoParam["nClusters"])
        self.cluster = KMeans(n_clusters=self.n_clusters,
                              random_state=0)

    def buildModel(self, train_data):
        """
        Fit the estimator, store the centroids and save the cluster plot.

        :param train_data: training samples; indexed with ``.iloc`` when
            plotting, so a pandas DataFrame is expected
        """
        self.cluster = self.cluster.fit(train_data)
        # labels_ holds the cluster index assigned to every sample.
        y_pred = self.cluster.labels_
        # cluster_centers_ holds the centroid coordinates.
        self.centroid = self.cluster.cluster_centers_

        self._savePredictionImage(train_data, y_pred)

    def _savePredictionImage(self, train_data, y_pred):
        """Render the clustered samples to ``self.cluster_pred_image``.

        Nothing is drawn for fewer than two feature columns (as before).  A
        dedicated figure is created and always closed, so the plot can neither
        land on a figure left open by earlier code nor leak a figure.
        """
        n_features = len(self.feature_cols)
        if n_features < 2:
            return

        fig = plt.figure()
        try:
            if n_features == 2:
                ax = fig.add_subplot(111)
                for i in range(self.n_clusters):
                    ax.scatter(train_data.iloc[y_pred == i, 0],
                               train_data.iloc[y_pred == i, 1],
                               marker="o",  # point shape
                               s=8,  # point size
                               c=ColorUtil.getRandomColor())
                # Centroids are only overlaid in the plain 2-D case.
                ax.scatter(self.centroid[:, 0], self.centroid[:, 1],
                           marker="x",
                           s=15,
                           c="black")
            elif n_features == 3:
                ax = fig.add_subplot(111, projection='3d')  # 3-D axes
                ax.set_title('3d_image_show')
                for i in range(self.n_clusters):
                    ax.scatter(train_data.iloc[y_pred == i, 0],
                               train_data.iloc[y_pred == i, 1],
                               train_data.iloc[y_pred == i, 2],
                               marker="o",  # point shape
                               s=15,  # point size
                               c=ColorUtil.getRandomColor())
            else:
                # More than three features: plot the first two columns of
                # self.train_data_dr (presumably a dimensionality-reduced copy
                # of the training data prepared by the base class -- confirm).
                # NOTE(review): this is a 2-D plot although the title still
                # says "3d"; the title is left untouched to keep output stable.
                ax = fig.add_subplot(111)
                ax.set_title('3d_image_show')
                for i in range(self.n_clusters):
                    ax.scatter(self.train_data_dr[y_pred == i, 0],
                               self.train_data_dr[y_pred == i, 1],
                               marker="o",  # point shape
                               s=15,  # point size
                               c=ColorUtil.getRandomColor())
            fig.savefig(self.cluster_pred_image, dpi=300)
        finally:
            plt.close(fig)

    def evalModel(self, train_data, test_data):
        """
        Score the fitted model and persist the clustering result.

        :param train_data: samples the model was fitted on
        :param test_data: unused; accepted to keep the method signature
        """
        # Evaluation metric: silhouette coefficient of the fitted labels.
        self.silhouette_score = silhouette_score(train_data, self.cluster.labels_)

        # End time and elapsed time.
        # NOTE(review): the value comes from DateUtil.diffMin although the
        # variable is named cost_second -- confirm the unit.
        end_time = DateUtil.getCurrentDate()
        cost_second = DateUtil.diffMin(self.start_time, end_time)

        # Persist the result through LogUtil.
        cluster_result = ClusterResult(self.param["id"],
                                       "kmeans",
                                       self.param,
                                       self.app_name,
                                       self.info,
                                       self.describe,
                                       self.two_dim_dis_image,
                                       self.cluster_pred_image,
                                       self.centroid,
                                       self.silhouette_score,
                                       "success",
                                       self.start_time,
                                       end_time,
                                       cost_second)
        LogUtil.saveClusterResult(self.meta_data_source, cluster_result)

    @staticmethod
    def _parseIntParam(raw_value):
        """Return ``raw_value`` as an int, or None when missing, blank or not an integer."""
        if raw_value is None or StringUtils.isBlack(raw_value):
            return None
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return None

    def _saveParamTrainResult(self, result_info, status):
        """Persist one hyper-parameter search outcome through LogUtil.

        :param result_info: error text on failure, image path on success
        :param status: ``"success"`` or ``"failed"``
        """
        end_time = DateUtil.getCurrentDate()
        cost_second = DateUtil.diffMin(self.start_time, end_time)
        param_train_result = ParamTrainResult(self.param["id"],
                                              "kmeans",
                                              self.param,
                                              self.app_name,
                                              result_info,
                                              status,
                                              self.start_time,
                                              end_time,
                                              cost_second)
        LogUtil.saveParamTrainResult(self.meta_data_source, param_train_result)

    def paramTrain(self):
        """
        Hyper-parameter search over the number of clusters.

        Reads ``paramName``, ``paramStartValue``, ``paramEndValue`` and
        ``paramRangeValue`` (the step) from ``self.param["paramTrain"]``, fits
        one KMeans per candidate in ``range(start, end, step)``, plots the
        silhouette coefficient against the candidate values and saves the
        image path as a "success" result.  Invalid input is recorded as a
        "failed" result instead of raising.

        :return: None
        """
        param_train = self.param["paramTrain"]
        raw_name = param_train.get("paramName")
        param_name = None
        if raw_name is not None and not StringUtils.isBlack(raw_name):
            param_name = str(raw_name)
        param_start_value = self._parseIntParam(param_train.get("paramStartValue"))
        param_end_value = self._parseIntParam(param_train.get("paramEndValue"))
        param_range_value = self._parseIntParam(param_train.get("paramRangeValue"))

        # The silhouette coefficient needs at least two clusters, so any
        # start value below 2 (not only exactly 1) is rejected.
        if (param_name is None
                or param_start_value is None
                or param_end_value is None
                or param_range_value is None
                or param_start_value < 2):
            self._saveParamTrainResult("请确认参数必须为整数，且参数起始值不能为1", "failed")
            return

        # Only n_clusters can be searched; record the rejection instead of
        # returning silently with no result row.
        if param_name != "n_clusters":
            self._saveParamTrainResult("不支持的超参数：" + param_name + "，目前仅支持n_clusters", "failed")
            return

        # A zero step makes range() raise; a negative step or start >= end
        # yields an empty sweep that would otherwise be reported as success.
        if param_range_value < 1 or param_start_value >= param_end_value:
            self._saveParamTrainResult("请确认参数步长必须大于0，且参数起始值必须小于参数结束值", "failed")
            return

        # Load the data only once the request is known to be valid.
        train_data, test_data = self.getModelData()

        candidates = list(range(param_start_value, param_end_value, param_range_value))
        eval_value_list = []
        for n_clusters in candidates:
            self.cluster = KMeans(n_clusters=n_clusters,
                                  random_state=0).fit(train_data)
            eval_value_list.append(silhouette_score(train_data, self.cluster.labels_))

        # Save the learning curve; the figure is closed even if saving fails.
        param_train_image = self.image_path + "cluster_param_train_" + DateUtil.getCurrentDateSimple() + ".png"
        fig, ax = plt.subplots(1, 1)
        try:
            ax.set_title("聚类个数--轮廓函数--超参数学习曲线")
            ax.plot(candidates, eval_value_list)
            fig.savefig(param_train_image, dpi=300)
        finally:
            plt.close(fig)

        self._saveParamTrainResult(param_train_image, "success")

if __name__ == '__main__':
    # Entry point: the job is launched with ONE command-line argument, a JSON
    # object describing the task.  Keys read here: appName, dataSourceId,
    # tableName, featureCols and (optionally) paramTrain.  Example payload:
    #   {"algoParam": {"nClusters": "3"}, "appName": "kmeans_1",
    #    "dataSourceId": "9", "featureCols": ["age", "income", "sex"],
    #    "id": "1680146960009", "tableName": "data1"}
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: kmeans.py '<json job parameters>'")

    param = json.loads(sys.argv[1])
    kmeans = TWKMeansAlgo(app_name=param["appName"],
                          data_source_id=param["dataSourceId"],
                          table_name=param["tableName"],
                          feature_cols=param["featureCols"],
                          param=param)
    # A "paramTrain" section switches the job from a normal clustering run to
    # the hyper-parameter search.
    if "paramTrain" in param:
        kmeans.paramTrain()
    else:
        kmeans.execute()



